NYC Traffic Accidents - Exploratory Data Analysis (Python)¶
In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
# pio.renderers.default = "notebook_connected"
clean_path = "../data/nyc_traffic_accidents_clean.csv"
df = pd.read_csv(clean_path)
df.shape, df.dtypes.head()
Out[1]:
((74881, 28), crash_date object borough object zip_code float64 latitude float64 longitude float64 dtype: object)
1. Monthly Trends¶
In [2]:
monthly = (
df.groupby(["month", "month_name"])
.agg(
total_crashes=("collision_id", "count"),
fatal_crashes=("is_fatal", "sum")
)
.reset_index()
.sort_values("month")
)
total_fatal_crashes = sum(monthly["fatal_crashes"])
monthly["fatal_rate_pct"] = round((monthly["fatal_crashes"] / total_fatal_crashes) * 100, 2)
monthly
Out[2]:
| month | month_name | total_crashes | fatal_crashes | fatal_rate_pct | |
|---|---|---|---|---|---|
| 0 | 1 | January | 14287 | 17 | 12.59 |
| 1 | 2 | February | 13684 | 20 | 14.81 |
| 2 | 3 | March | 11057 | 8 | 5.93 |
| 3 | 4 | April | 4116 | 13 | 9.63 |
| 4 | 5 | May | 6149 | 12 | 8.89 |
| 5 | 6 | June | 7616 | 30 | 22.22 |
| 6 | 7 | July | 9225 | 20 | 14.81 |
| 7 | 8 | August | 8747 | 15 | 11.11 |
In [3]:
fig = go.Figure()
# Bars: total crashes
fig.add_trace(go.Bar(
x=monthly["month_name"],
y=monthly["total_crashes"],
name="Total Crashes"
))
# Line: fatality rate
fig.add_trace(go.Scatter(
x=monthly["month_name"],
y=monthly["fatal_rate_pct"],
name="Share of Fatal Crashes (%)",
mode="lines+markers",
yaxis="y2"
))
fig.update_layout(
title="Monthly Collision Trends (Jan-Aug 2020)",
xaxis_title="Month",
yaxis_title="Number of Crashes",
yaxis2=dict(
title="Share (%)",
overlaying="y",
side="right"
),
barmode="group",
template="plotly_white"
)
fig.show()
2. Day-of-Week Analysis¶
In [4]:
weekday_order = [
"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
]
df["day_of_week"] = pd.Categorical(
df["day_of_week"],
categories=weekday_order,
ordered=True
)
dow = (
df.groupby("day_of_week", observed=True)
.agg(
total_crashes=("collision_id", "count"),
fatal_crashes=("is_fatal", "sum")
)
.reset_index()
.sort_values("day_of_week")
)
dow["fatal_rate_pct"] = round((dow["fatal_crashes"] / total_fatal_crashes) * 100, 2)
dow
Out[4]:
| day_of_week | total_crashes | fatal_crashes | fatal_rate_pct | |
|---|---|---|---|---|
| 0 | Sunday | 9003 | 22 | 16.30 |
| 1 | Monday | 10511 | 16 | 11.85 |
| 2 | Tuesday | 10613 | 25 | 18.52 |
| 3 | Wednesday | 10638 | 15 | 11.11 |
| 4 | Thursday | 11244 | 13 | 9.63 |
| 5 | Friday | 12271 | 16 | 11.85 |
| 6 | Saturday | 10601 | 28 | 20.74 |
In [5]:
fig = go.Figure()
# Bars: total crashes
fig.add_trace(go.Bar(
x=dow["day_of_week"],
y=dow["total_crashes"],
name="Total Crashes"
))
# Line: fatality rate
fig.add_trace(go.Scatter(
x=dow["day_of_week"],
y=dow["fatal_rate_pct"],
name="Share of Fatal Crashes (%)",
mode="lines+markers",
yaxis="y2"
))
fig.update_layout(
title="Crashes by Day of Week",
xaxis_title="Day of Week",
yaxis_title="Number of Crashes",
yaxis2=dict(
title="Share (%)",
overlaying="y",
side="right"
),
template="plotly_white"
)
fig.show()
3. Hour-of-Day Patterns¶
In [6]:
hourly = (
df.groupby("hour")
.agg(
total_crashes=("collision_id", "count"),
fatal_crashes=("is_fatal", "sum")
)
.reset_index()
.sort_values("hour")
)
hourly.head()
Out[6]:
| hour | total_crashes | fatal_crashes | |
|---|---|---|---|
| 0 | 0 | 2948 | 7 |
| 1 | 1 | 1474 | 7 |
| 2 | 2 | 1139 | 2 |
| 3 | 3 | 989 | 7 |
| 4 | 4 | 975 | 7 |
In [7]:
fig = go.Figure()
fig.add_trace(go.Scatter(
x=hourly["hour"],
y=hourly["total_crashes"],
name="Total Crashes",
mode="lines+markers"
))
fig.update_layout(
title="Crashes by Hour of Day",
xaxis=dict(
title="Hour of Day",
dtick=1,
tickvals=hourly["hour"]
),
yaxis_title="Total Crashes",
template="plotly_white"
)
fig.show()
In [8]:
weekday_df = df[~df["is_weekend"]]
weekend_df = df[df["is_weekend"]]
weekday_hourly = weekday_df.groupby("hour")["collision_id"].count() / weekday_df["crash_date"].nunique()
weekend_hourly = weekend_df.groupby("hour")["collision_id"].count() / weekend_df["crash_date"].nunique()
hourly_avg = pd.DataFrame({
"hour": range(24),
"weekday_avg": weekday_hourly.reindex(range(24), fill_value=0).values,
"weekend_avg": weekend_hourly.reindex(range(24), fill_value=0).values
})
hourly_avg.head()
Out[8]:
| hour | weekday_avg | weekend_avg | |
|---|---|---|---|
| 0 | 0 | 11.052023 | 15.014493 |
| 1 | 1 | 4.809249 | 9.304348 |
| 2 | 2 | 3.520231 | 7.681159 |
| 3 | 3 | 2.907514 | 7.043478 |
| 4 | 4 | 3.069364 | 6.434783 |
In [9]:
fig = go.Figure()
fig.add_trace(go.Scatter(
x=hourly_avg["hour"],
y=hourly_avg["weekday_avg"],
name="Weekday Avg",
mode="lines+markers"
))
fig.add_trace(go.Scatter(
x=hourly_avg["hour"],
y=hourly_avg["weekend_avg"],
name="Weekend Avg",
mode="lines+markers"
))
fig.update_layout(
title="Average Crashes by Hour",
xaxis=dict(
title="Hour of Day",
dtick=1,
tickvals=hourly_avg["hour"]
),
yaxis_title="Avg Crashes per Day",
template="plotly_white"
)
fig.show()
In [10]:
radar_df = hourly_avg.copy()
radar_df.loc[24] = radar_df.loc[0] # close the loop
radar_df["hour_label"] = radar_df["hour"].astype(int).astype(str)
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
r=radar_df["weekday_avg"],
theta=radar_df["hour_label"],
name="Weekday Avg",
))
fig.add_trace(go.Scatterpolar(
r=radar_df["weekend_avg"],
theta=radar_df["hour_label"],
name="Weekend Avg",
))
fig.update_layout(
title="Radar Chart - Weekday vs Weekend Hourly Patterns",
template="plotly_white",
polar=dict(
radialaxis=dict(visible=True),
angularaxis=dict(direction="clockwise")
)
)
fig.show()
4. Day × Hour Heatmap¶
In [11]:
day_hour = (
df.groupby(["day_of_week", "hour"], observed=True)["collision_id"]
.count()
.reset_index()
.pivot(index="day_of_week", columns="hour", values="collision_id")
.reindex(index=weekday_order) # just to be explicit
)
all_hours = list(range(24))
day_hour = day_hour.reindex(columns=all_hours, fill_value=0)
day_hour
Out[11]:
| hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| day_of_week | |||||||||||||||||||||
| Sunday | 510 | 312 | 271 | 254 | 232 | 195 | 182 | 165 | 231 | 241 | ... | 615 | 537 | 566 | 539 | 526 | 462 | 407 | 356 | 365 | 292 |
| Monday | 385 | 200 | 126 | 112 | 117 | 142 | 310 | 407 | 591 | 552 | ... | 682 | 690 | 746 | 736 | 658 | 494 | 373 | 325 | 278 | 219 |
| Tuesday | 333 | 134 | 88 | 96 | 78 | 159 | 294 | 433 | 643 | 576 | ... | 719 | 654 | 799 | 750 | 693 | 509 | 444 | 326 | 339 | 264 |
| Wednesday | 367 | 154 | 129 | 106 | 103 | 176 | 258 | 407 | 563 | 509 | ... | 741 | 730 | 756 | 743 | 688 | 520 | 402 | 378 | 366 | 259 |
| Thursday | 391 | 144 | 116 | 78 | 118 | 148 | 316 | 445 | 679 | 593 | ... | 722 | 683 | 814 | 778 | 703 | 556 | 454 | 399 | 340 | 283 |
| Friday | 436 | 200 | 150 | 111 | 115 | 178 | 315 | 383 | 648 | 638 | ... | 815 | 781 | 870 | 804 | 800 | 628 | 574 | 472 | 430 | 428 |
| Saturday | 526 | 330 | 259 | 232 | 212 | 180 | 193 | 223 | 323 | 330 | ... | 722 | 602 | 668 | 624 | 628 | 569 | 484 | 455 | 439 | 419 |
7 rows × 24 columns
In [12]:
fig = px.imshow(
day_hour,
labels=dict(
x="Hour of Day",
y="Day of Week",
color="Number of Crashes"
),
x=day_hour.columns,
y=day_hour.index,
aspect="auto",
color_continuous_scale="Reds"
)
fig.update_layout(
title="Crashes Heatmap - Day of Week × Hour of Day",
xaxis=dict(
tickvals=day_hour.columns
),
template="plotly_white"
)
fig.show()
5. High-Risk Locations¶
In [13]:
streets = df[df["on_street_name"].notna()].copy()
streets["on_street_name"] = streets["on_street_name"].str.strip()
top_streets = (
streets.groupby("on_street_name")
.agg(total_crashes=("collision_id", "count"))
.reset_index()
.sort_values("total_crashes", ascending=False)
.head(10)
)
# Add % of total collisions
total_crashes_citywide = len(df)
top_streets["pct_of_total"] = round((top_streets["total_crashes"] / total_crashes_citywide) * 100, 2)
top_streets
Out[13]:
| on_street_name | total_crashes | pct_of_total | |
|---|---|---|---|
| 942 | BELT PARKWAY | 1241 | 1.66 |
| 2403 | LONG ISLAND EXPRESSWAY | 745 | 0.99 |
| 1051 | BROOKLYN QUEENS EXPRESSWAY | 738 | 0.99 |
| 1834 | FDR DRIVE | 728 | 0.97 |
| 2455 | MAJOR DEEGAN EXPRESSWAY | 591 | 0.79 |
| 1976 | GRAND CENTRAL PKWY | 581 | 0.78 |
| 1031 | BROADWAY | 575 | 0.77 |
| 742 | ATLANTIC AVENUE | 532 | 0.71 |
| 1322 | CROSS BRONX EXPY | 526 | 0.70 |
| 1325 | CROSS ISLAND PARKWAY | 512 | 0.68 |
In [14]:
fig = go.Figure()
fig.add_trace(go.Bar(
x=top_streets["pct_of_total"],
y=top_streets["on_street_name"],
orientation="h",
name="Share of Total Collisions (%)",
text=top_streets["pct_of_total"].astype(str) + "%",
textposition="outside"
))
fig.update_layout(
title="Top 10 Streets by Collision Frequency",
xaxis_title="Share of Total Collisions (%)",
yaxis_title="Street",
template="plotly_white",
)
fig.show()
6. Contributing Factors¶
In [15]:
cf = df.copy()
# normalize strings
cf["contributing_factor_vehicle_1"] = (
cf["contributing_factor_vehicle_1"]
.fillna("Unspecified")
.str.strip()
)
# eliminate weird placeholders (Excel often had blanks)
cf["contributing_factor_vehicle_1"] = (
cf["contributing_factor_vehicle_1"]
.replace("", "Unspecified")
.fillna("Unspecified")
)
cf_clean = cf[cf["contributing_factor_vehicle_1"] != "Unspecified"]
In [16]:
cf_all = (
cf_clean.groupby("contributing_factor_vehicle_1")
.agg(total_crashes=("collision_id", "count"))
.reset_index()
.sort_values("total_crashes", ascending=False)
.head(10)
)
total_crashes_citywide = len(df)
cf_all["pct_of_total"] = ((cf_all["total_crashes"] / total_crashes_citywide) * 100).astype(int)
cf_all
Out[16]:
| contributing_factor_vehicle_1 | total_crashes | pct_of_total | |
|---|---|---|---|
| 8 | Driver Inattention/Distraction | 19123 | 25 |
| 17 | Following Too Closely | 5202 | 6 |
| 14 | Failure to Yield Right-of-Way | 4815 | 6 |
| 32 | Passing or Lane Usage Improper | 2840 | 3 |
| 4 | Backing Unsafely | 2829 | 3 |
| 31 | Passing Too Closely | 2687 | 3 |
| 27 | Other Vehicular | 2233 | 2 |
| 49 | Unsafe Speed | 2015 | 2 |
| 48 | Unsafe Lane Changing | 1809 | 2 |
| 46 | Traffic Control Disregarded | 1544 | 2 |
In [17]:
fig = px.treemap(
cf_all,
path=["contributing_factor_vehicle_1"],
values="pct_of_total",
color="pct_of_total",
color_continuous_scale="Blues",
title="Top 10 Contributing Factors - Share of Total Collisions (%)"
)
fig.data[0].texttemplate="%{label}<br>%{value}%"
fig.show()
In [18]:
cf_fatal = (
cf_clean[cf_clean["is_fatal"]]
.groupby("contributing_factor_vehicle_1")
.agg(fatal_crashes=("collision_id", "count"))
.reset_index()
.sort_values("fatal_crashes", ascending=False)
)
total_fatal = cf_fatal["fatal_crashes"].sum()
cf_fatal["pct_of_fatal"] = ((cf_fatal["fatal_crashes"] / total_fatal) * 100).astype(int)
cf_fatal.head(10)
Out[18]:
| contributing_factor_vehicle_1 | fatal_crashes | pct_of_fatal | |
|---|---|---|---|
| 16 | Unsafe Speed | 33 | 32 |
| 14 | Traffic Control Disregarded | 15 | 14 |
| 3 | Driver Inattention/Distraction | 13 | 12 |
| 6 | Failure to Yield Right-of-Way | 10 | 9 |
| 13 | Pedestrian/Bicyclist/Other Pedestrian Error/Co... | 6 | 5 |
| 9 | Illnes | 5 | 4 |
| 4 | Driver Inexperience | 4 | 3 |
| 17 | View Obstructed/Limited | 3 | 2 |
| 2 | Backing Unsafely | 3 | 2 |
| 1 | Alcohol Involvement | 2 | 1 |
In [28]:
fig = px.treemap(
cf_fatal.head(5),
path=["contributing_factor_vehicle_1"],
values="pct_of_fatal",
color="pct_of_fatal",
color_continuous_scale="Reds",
title="Top Contributing Factors - Fatal Collisions (%)"
)
fig.data[0].texttemplate="%{label}<br>%{value}%"
fig.show()
7. Vehicle Types & Victims¶
In [20]:
vt = df.copy()
vt["vehicle_type_code_1"] = vt["vehicle_type_code_1"].fillna("Unspecified").str.strip()
vt_all = (
vt.groupby("vehicle_type_code_1")
.agg(total_crashes=("collision_id", "count"))
.reset_index()
.sort_values("total_crashes", ascending=False)
)
total_crashes = len(df)
vt_all["pct_of_total"] = round(vt_all["total_crashes"] / total_crashes * 100, 1)
vt_all_clean = vt_all[vt_all["vehicle_type_code_1"] != "Unspecified"]
vt_all_clean.head(10)
Out[20]:
| vehicle_type_code_1 | total_crashes | pct_of_total | |
|---|---|---|---|
| 179 | Sedan | 34349 | 45.9 |
| 182 | Station Wagon/Sport Utility Vehicle | 27541 | 36.8 |
| 196 | Taxi | 2768 | 3.7 |
| 162 | Pick-up Truck | 1882 | 2.5 |
| 19 | Box Truck | 1417 | 1.9 |
| 22 | Bus | 950 | 1.3 |
| 17 | Bike | 882 | 1.2 |
| 203 | Tractor Truck Diesel | 587 | 0.8 |
| 138 | Motorcycle | 518 | 0.7 |
| 226 | Van | 430 | 0.6 |
In [21]:
vt_fatal = (
vt[vt["is_fatal"]]
.groupby("vehicle_type_code_1")
.agg(fatal_crashes=("collision_id", "count"))
.reset_index()
.sort_values("fatal_crashes", ascending=False)
)
total_fatal = vt_fatal["fatal_crashes"].sum()
vt_fatal["pct_of_fatal"] = round(vt_fatal["fatal_crashes"] / total_fatal * 100, 1)
vt_fatal_clean = vt_fatal[vt_fatal["vehicle_type_code_1"] != "Unspecified"]
vt_fatal_clean
Out[21]:
| vehicle_type_code_1 | fatal_crashes | pct_of_fatal | |
|---|---|---|---|
| 14 | Sedan | 45 | 33.3 |
| 15 | Station Wagon/Sport Utility Vehicle | 33 | 24.4 |
| 12 | Motorcycle | 20 | 14.8 |
| 16 | Taxi | 5 | 3.7 |
| 4 | Bus | 4 | 3.0 |
| 13 | Pick-up Truck | 3 | 2.2 |
| 3 | Box Truck | 3 | 2.2 |
| 8 | E-Scooter | 3 | 2.2 |
| 10 | Flat Rack | 2 | 1.5 |
| 17 | Tractor Truck Diesel | 2 | 1.5 |
| 6 | Convertible | 2 | 1.5 |
| 7 | Dump | 2 | 1.5 |
| 0 | AMBULANCE | 1 | 0.7 |
| 11 | Motorbike | 1 | 0.7 |
| 1 | Ambulance | 1 | 0.7 |
| 9 | FORK LIFT | 1 | 0.7 |
| 5 | Concrete Mixer | 1 | 0.7 |
| 2 | Bike | 1 | 0.7 |
| 19 | Van | 1 | 0.7 |
In [22]:
vt_merge = vt_all_clean.merge(vt_fatal_clean, on="vehicle_type_code_1", how="left")
vt_merge["fatal_crashes"] = vt_merge["fatal_crashes"].fillna(0)
vt_merge["pct_of_fatal"] = vt_merge["pct_of_fatal"].fillna(0)
vt_merge.head(10)
Out[22]:
| vehicle_type_code_1 | total_crashes | pct_of_total | fatal_crashes | pct_of_fatal | |
|---|---|---|---|---|---|
| 0 | Sedan | 34349 | 45.9 | 45.0 | 33.3 |
| 1 | Station Wagon/Sport Utility Vehicle | 27541 | 36.8 | 33.0 | 24.4 |
| 2 | Taxi | 2768 | 3.7 | 5.0 | 3.7 |
| 3 | Pick-up Truck | 1882 | 2.5 | 3.0 | 2.2 |
| 4 | Box Truck | 1417 | 1.9 | 3.0 | 2.2 |
| 5 | Bus | 950 | 1.3 | 4.0 | 3.0 |
| 6 | Bike | 882 | 1.2 | 1.0 | 0.7 |
| 7 | Tractor Truck Diesel | 587 | 0.8 | 2.0 | 1.5 |
| 8 | Motorcycle | 518 | 0.7 | 20.0 | 14.8 |
| 9 | Van | 430 | 0.6 | 1.0 | 0.7 |
In [23]:
fig = go.Figure()
fig.add_trace(go.Bar(
x=vt_merge["vehicle_type_code_1"].head(10),
y=vt_merge["pct_of_total"].head(10),
name="% of Total Collisions",
text=vt_merge["pct_of_total"].round(1).astype(str) + "%",
textposition="outside"
))
fig.add_trace(go.Bar(
x=vt_merge["vehicle_type_code_1"].head(10),
y=vt_merge["pct_of_fatal"].head(10),
name="% of Fatal Collisions",
text=vt_merge["pct_of_fatal"].round(1).astype(str) + "%",
textposition="outside"
))
fig.update_layout(
title="Vehicle Types Involved — Total vs Fatal Collisions",
xaxis_title="Vehicle Type",
yaxis_title="Percentage (%)",
barmode="group",
template="plotly_white",
yaxis=dict(range=[0, max(vt_merge["pct_of_total"].max(), vt_merge["pct_of_fatal"].max()) * 1.2]),
)
fig.show()
In [24]:
injured = pd.DataFrame({
"category": ["Motorists", "Pedestrians", "Cyclists"],
"count": [
df["number_of_motorist_injured"].sum(),
df["number_of_pedestrians_injured"].sum(),
df["number_of_cyclist_injured"].sum()
]
})
In [25]:
fig = px.pie(
injured,
names="category",
values="count",
title="Distribution of Injured People",
color_discrete_sequence=px.colors.sequential.Blues
)
fig.show()
In [26]:
killed = pd.DataFrame({
"category": ["Motorists", "Pedestrians", "Cyclists"],
"count": [
df["number_of_motorist_killed"].sum(),
df["number_of_pedestrians_killed"].sum(),
df["number_of_cyclist_killed"].sum()
]
})
In [27]:
fig = px.pie(
killed,
names="category",
values="count",
title="Distribution of Fatalities",
color_discrete_sequence=px.colors.sequential.Reds
)
fig.show()